Table. eGene Types



In [1]:

    
import copy
import cPickle
import os
import subprocess

import cdpybio as cpb
import matplotlib as mpl
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pybedtools as pbt
import scipy.stats as stats
import seaborn as sns

import ciepy
import cardipspy as cpy

%matplotlib inline

dy_name = 'table_egene_types_and_gwas'
    
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)

private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)

import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
    dy = os.path.join(ciepy.root, 'sandbox', 'tmp', dy_name)
    cpy.makedir(dy)
    pbt.set_tempdir(dy)



In [2]:

    
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)



In [3]:

    
fn = os.path.join(ciepy.root, 'output', 'gwas_analysis', 'pe_no_hla_grasp_counts.tsv')
grasp_counts = pd.read_table(fn, index_col=0)

fn = os.path.join(ciepy.root, 'output', 'gwas_analysis', 'grasp_results.tsv')
grasp_res = pd.read_table(fn, index_col=0)
#grasp_res['phenotype'] = grasp_res.phenotype.apply(lambda x: x.split(' (')[0])

grasp_counts.index = grasp_res.ix[grasp_counts.index, 'phenotype']
grasp_res.index = grasp_res.phenotype
grasp_res = grasp_res.ix[[x for x in grasp_res.index if 'xpression' not in x]]

t = grasp_res[['lead_odds', 'pe_odds', 'pe_no_hla_odds']]
t['odds_sum'] = t.sum(axis=1)
t.sort_values(by='odds_sum', inplace=True, ascending=False)
t = t.drop('odds_sum', axis=1)

grasp_res = grasp_res.ix[t.index]

grasp_res.columns = [x.replace('pe_', 'peqtn_') for x in grasp_res.columns]
grasp_res = grasp_res[[x for x in grasp_res.columns if 'bh' not in x]].drop('phenotype', axis=1)
grasp_res = grasp_res[[x for x in grasp_res.columns if 'peqtn' not in x]]
grasp_res.columns = [x.replace('lead_', '') for x in grasp_res.columns]

grasp_res = grasp_res.sort_values(by='pvalue')



In [4]:

    
dy = os.path.join(ciepy.root, 'output/eqtl_processing/eqtls01')
fn = os.path.join(dy, 'qvalues.tsv')
qvalues = pd.read_table(fn, index_col=0)



In [5]:

    
sig = qvalues[qvalues.perm_sig]



In [6]:

    
a = gene_info.ix[sig.index, 'gene_type'].value_counts()
b = gene_info.ix[qvalues.index, 'gene_type'].value_counts()
t = pd.concat([a, b], axis=1)
t = t.fillna(0)
t.columns = ['Significant', 'Tested']
t = t[['Tested', 'Significant']]
t.sort_values(by=['Tested', 'Significant'], inplace=True, ascending=False)
t['Percent significant'] = (t.Significant / t.Tested).round(3) * 100
t['Percent eGenes'] = (t.Significant / t.Significant.sum()).round(3) * 100
t.index = [x.replace('_', ' ') for x in t.index]



In [7]:

    
t









    Out[7]:






  
    
      
      Tested
      Significant
      Percent significant
      Percent eGenes
    
  
  
    
      protein coding
      13837
      4622
      33.4
      80.4
    
    
      pseudogene
      1393
      334
      24.0
      5.8
    
    
      lincRNA
      1049
      362
      34.5
      6.3
    
    
      antisense
      926
      269
      29.0
      4.7
    
    
      processed transcript
      223
      92
      41.3
      1.6
    
    
      sense intronic
      133
      29
      21.8
      0.5
    
    
      snoRNA
      69
      4
      5.8
      0.1
    
    
      misc RNA
      59
      5
      8.5
      0.1
    
    
      sense overlapping
      54
      16
      29.6
      0.3
    
    
      snRNA
      30
      0
      0.0
      0.0
    
    
      polymorphic pseudogene
      15
      8
      53.3
      0.1
    
    
      3prime overlapping ncrna
      7
      1
      14.3
      0.0
    
    
      miRNA
      3
      1
      33.3
      0.0
    
    
      TR C gene
      2
      2
      100.0
      0.0
    
    
      IG V gene
      2
      1
      50.0
      0.0
    
    
      TR V gene
      2
      0
      0.0
      0.0
    
    
      IG V pseudogene
      1
      0
      0.0
      0.0



In [8]:

    
writer = pd.ExcelWriter(os.path.join(outdir, 'egene_types_gwas.xlsx'))
t.to_excel(writer, sheet_name='eGene summary')
grasp_res.to_excel(writer, sheet_name='GWAS enrichments')
writer.save()

	Tested	Significant	Percent significant	Percent eGenes
protein coding	13837	4622	33.4	80.4
pseudogene	1393	334	24.0	5.8
lincRNA	1049	362	34.5	6.3
antisense	926	269	29.0	4.7
processed transcript	223	92	41.3	1.6
sense intronic	133	29	21.8	0.5
snoRNA	69	4	5.8	0.1
misc RNA	59	5	8.5	0.1
sense overlapping	54	16	29.6	0.3
snRNA	30	0	0.0	0.0
polymorphic pseudogene	15	8	53.3	0.1
3prime overlapping ncrna	7	1	14.3	0.0
miRNA	3	1	33.3	0.0
TR C gene	2	2	100.0	0.0
IG V gene	2	1	50.0	0.0
TR V gene	2	0	0.0	0.0
IG V pseudogene	1	0	0.0	0.0